In [ ]:
In [ ]:
In [449]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
In [450]:
import os
In [451]:
os.listdir(r"/Users/mani/Documents/Data analytics projects/Datasets/")
Out[451]:
['other-Lyft_B02510.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Skyline_B00111.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-janjune-15.csv', 'other-American_B01362.csv', 'uber-raw-data-apr14.csv', 'Uber-Jan-Feb-FOIL.csv', 'other-Highclass_B01717.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv', 'uber-raw-data-jul14.csv', 'other-Federal_02216.csv', 'uber-raw-data-jun14.csv', 'other-Carmel_B00256.csv', 'other-Diplo_B01196.csv', 'other-Dial7_B00887.csv', 'uber-raw-data-may14.csv', 'other-Prestige_B01338.csv']
In [ ]:
In [452]:
uber_15 = pd.read_csv(r"/Users/mani/Documents/Data analytics projects/Datasets/uber-raw-data-janjune-15_sample.csv")
In [453]:
uber_15.shape
Out[453]:
(100000, 4)
In [ ]:
In [454]:
type(uber_15)
Out[454]:
pandas.core.frame.DataFrame
In [ ]:
In [455]:
uber_15.duplicated().sum()
Out[455]:
54
In [ ]:
In [456]:
print("The dataset contained 54 duplicated entries, which were removed to ensure the integrity of the analysis. After removing duplicates, the dataset now has 99,946 rows.")
The dataset contained 54 duplicated entries, which were removed to ensure the integrity of the analysis. After removing duplicates, the dataset now has 99,946 rows.
In [ ]:
In [457]:
uber_15.drop_duplicates(inplace=True)
In [458]:
uber_15.duplicated().sum()
Out[458]:
0
In [459]:
uber_15.shape
Out[459]:
(99946, 4)
In [ ]:
In [460]:
uber_15.dtypes
Out[460]:
Dispatching_base_num object Pickup_date object Affiliated_base_num object locationID int64 dtype: object
In [ ]:
In [461]:
print("The dataset columns have a mix of data types, including object for categorical variables and int64 for numerical variables. The Pickup_date was originally of type object but was later converted to datetime for time-series analysis.")
The dataset columns have a mix of data types, including object for categorical variables and int64 for numerical variables. The Pickup_date was originally of type object but was later converted to datetime for time-series analysis.
In [ ]:
In [462]:
uber_15.isnull().sum()
Out[462]:
Dispatching_base_num 0 Pickup_date 0 Affiliated_base_num 1116 locationID 0 dtype: int64
In [463]:
print("There are 1,116 missing values in the Affiliated_base_num column, which represents the affiliated base for each Uber ride. All other columns have no missing data, indicating completeness in other dimensions of the dataset.")
There are 1,116 missing values in the Affiliated_base_num column, which represents the affiliated base for each Uber ride. All other columns have no missing data, indicating completeness in other dimensions of the dataset.
In [ ]:
In [464]:
uber_15['Pickup_date'][0]
Out[464]:
'2015-05-02 21:43:00'
In [465]:
#The Pickup_date was initially in string format but was successfully converted to a datetime format.
In [466]:
type(uber_15['Pickup_date'][0])
Out[466]:
str
In [467]:
uber_15['Pickup_date']= pd.to_datetime(uber_15['Pickup_date'])
In [ ]:
In [468]:
print("The Pickup_date column was successfully converted from a string to a datetime format, allowing for further time-based analysis, such as extracting the hour, weekday, and month.")
The Pickup_date column was successfully converted from a string to a datetime format, allowing for further time-based analysis, such as extracting the hour, weekday, and month.
In [ ]:
In [469]:
uber_15['Pickup_date'].dtype
Out[469]:
dtype('<M8[ns]')
In [470]:
uber_15['Pickup_date'][0]
Out[470]:
Timestamp('2015-05-02 21:43:00')
In [471]:
type(uber_15['Pickup_date'][0])
Out[471]:
pandas._libs.tslibs.timestamps.Timestamp
In [472]:
uber_15.dtypes
Out[472]:
Dispatching_base_num object Pickup_date datetime64[ns] Affiliated_base_num object locationID int64 dtype: object
In [ ]:
In [473]:
uber_15['month']=uber_15['Pickup_date'].dt.month_name()
In [474]:
uber_15['month']
Out[474]:
0 May
1 January
2 March
3 April
4 March
...
99995 April
99996 March
99997 March
99998 May
99999 June
Name: month, Length: 99946, dtype: object
In [475]:
uber_15['month'].value_counts().plot(kind='bar')
Out[475]:
<Axes: xlabel='month'>
In [476]:
print("The bar chart illustrates the distribution of Uber rides across different months. This plot can help identify any seasonal trends or variations in Uber ride demand throughout the year, showing peaks and dips in ride counts per month.")
The bar chart illustrates the distribution of Uber rides across different months. This plot can help identify any seasonal trends or variations in Uber ride demand throughout the year, showing peaks and dips in ride counts per month.
In [ ]:
In [477]:
uber_15['weekday'] = uber_15['Pickup_date'].dt.day_name()
uber_15['day'] = uber_15['Pickup_date'].dt.day
uber_15['hour'] = uber_15['Pickup_date'].dt.hour
uber_15['minute'] = uber_15['Pickup_date'].dt.minute
In [478]:
uber_15.head(4)
Out[478]:
| Dispatching_base_num | Pickup_date | Affiliated_base_num | locationID | month | weekday | day | hour | minute | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | B02617 | 2015-05-02 21:43:00 | B02764 | 237 | May | Saturday | 2 | 21 | 43 |
| 1 | B02682 | 2015-01-20 19:52:59 | B02682 | 231 | January | Tuesday | 20 | 19 | 52 |
| 2 | B02617 | 2015-03-19 20:26:00 | B02617 | 161 | March | Thursday | 19 | 20 | 26 |
| 3 | B02764 | 2015-04-10 17:38:00 | B02764 | 107 | April | Friday | 10 | 17 | 38 |
In [ ]:
In [479]:
pivot1 = pd.crosstab(index=uber_15['month'], columns=uber_15['weekday'])
In [480]:
pivot1
Out[480]:
| weekday | Friday | Monday | Saturday | Sunday | Thursday | Tuesday | Wednesday |
|---|---|---|---|---|---|---|---|
| month | |||||||
| April | 2365 | 1833 | 2508 | 2052 | 2823 | 1880 | 2521 |
| February | 2655 | 1970 | 2550 | 2183 | 2396 | 2129 | 2013 |
| January | 2508 | 1353 | 2745 | 1651 | 2378 | 1444 | 1740 |
| June | 2793 | 2848 | 3037 | 2485 | 2767 | 3187 | 2503 |
| March | 2465 | 2115 | 2522 | 2379 | 2093 | 2388 | 2007 |
| May | 3262 | 1865 | 3519 | 2944 | 2627 | 2115 | 2328 |
In [481]:
pivot1.plot(kind='bar')
Out[481]:
<Axes: xlabel='month'>
In [ ]:
In [482]:
summary= uber_15.groupby(['weekday','hour'], as_index=False).size()
In [483]:
summary
Out[483]:
| weekday | hour | size | |
|---|---|---|---|
| 0 | Friday | 0 | 581 |
| 1 | Friday | 1 | 333 |
| 2 | Friday | 2 | 197 |
| 3 | Friday | 3 | 138 |
| 4 | Friday | 4 | 161 |
| ... | ... | ... | ... |
| 163 | Wednesday | 19 | 1044 |
| 164 | Wednesday | 20 | 897 |
| 165 | Wednesday | 21 | 949 |
| 166 | Wednesday | 22 | 900 |
| 167 | Wednesday | 23 | 669 |
168 rows × 3 columns
In [484]:
plt.figure(figsize=(7,5))
sns.pointplot(x="hour", y="size", hue="weekday", data=summary)
Out[484]:
<Axes: xlabel='hour', ylabel='size'>
In [ ]:
In [485]:
print("This point plot provides a detailed view of the average ride size throughout different hours of the day, broken down by each weekday. Key takeaways from this visualization include:\n\nPeak Hours for Uber Rides:\n\nThere is a clear surge in the number of rides during early morning hours (around 7-9 AM) and evening hours (around 5-8 PM). These peaks correspond to common commuting times on weekdays, reflecting a pattern where people are likely using Uber to travel to and from work.\n\nOn weekends, the peak hours tend to shift slightly later in the day, particularly in the late afternoon and evening, suggesting that Uber is used more for leisure or social events during these times.\n\nWeekday vs. Weekend Patterns:\n\nWeekdays (Monday to Friday) exhibit more consistent demand, with pronounced spikes during commuting hours. This suggests that Uber is predominantly used as a transport option for work-related commuting.\n\nOn weekends (Saturday and Sunday), the demand pattern changes. The peak occurs later in the day, and there is a more gradual increase in ride sizes, likely due to people using Uber for leisure activities or social events.\n\nOff-Peak Hours:\n\nLate-night and early-morning hours (between midnight and 6 AM) show consistently lower demand, regardless of the day of the week. However, weekends do show a slight increase in late-night demand, possibly due to late-night outings or events.\n\nConclusion:\n\nThis analysis of ride size by hour and weekday highlights important usage trends, particularly regarding how Uber rides are distributed across different hours and days. The distinct peaks during commuting times on weekdays and later peaks during weekends provide actionable insights into user behavior, which could be useful for understanding demand patterns and optimizing resource allocation for Uber.")
This point plot provides a detailed view of the average ride size throughout different hours of the day, broken down by each weekday. Key takeaways from this visualization include: Peak Hours for Uber Rides: There is a clear surge in the number of rides during early morning hours (around 7-9 AM) and evening hours (around 5-8 PM). These peaks correspond to common commuting times on weekdays, reflecting a pattern where people are likely using Uber to travel to and from work. On weekends, the peak hours tend to shift slightly later in the day, particularly in the late afternoon and evening, suggesting that Uber is used more for leisure or social events during these times. Weekday vs. Weekend Patterns: Weekdays (Monday to Friday) exhibit more consistent demand, with pronounced spikes during commuting hours. This suggests that Uber is predominantly used as a transport option for work-related commuting. On weekends (Saturday and Sunday), the demand pattern changes. The peak occurs later in the day, and there is a more gradual increase in ride sizes, likely due to people using Uber for leisure activities or social events. Off-Peak Hours: Late-night and early-morning hours (between midnight and 6 AM) show consistently lower demand, regardless of the day of the week. However, weekends do show a slight increase in late-night demand, possibly due to late-night outings or events. Conclusion: This analysis of ride size by hour and weekday highlights important usage trends, particularly regarding how Uber rides are distributed across different hours and days. The distinct peaks during commuting times on weekdays and later peaks during weekends provide actionable insights into user behavior, which could be useful for understanding demand patterns and optimizing resource allocation for Uber.
In [ ]:
In [ ]:
In [ ]:
In [486]:
os.listdir(r"/Users/mani/Documents/Data analytics projects/Datasets/")
Out[486]:
['other-Lyft_B02510.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Skyline_B00111.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-janjune-15.csv', 'other-American_B01362.csv', 'uber-raw-data-apr14.csv', 'Uber-Jan-Feb-FOIL.csv', 'other-Highclass_B01717.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv', 'uber-raw-data-jul14.csv', 'other-Federal_02216.csv', 'uber-raw-data-jun14.csv', 'other-Carmel_B00256.csv', 'other-Diplo_B01196.csv', 'other-Dial7_B00887.csv', 'uber-raw-data-may14.csv', 'other-Prestige_B01338.csv']
In [ ]:
In [487]:
uber_foil=pd.read_csv(r"/Users/mani/Documents/Data analytics projects/Datasets/Uber-Jan-Feb-FOIL.csv")
In [488]:
uber_foil.shape
Out[488]:
(354, 4)
In [489]:
uber_foil.head(3)
Out[489]:
| dispatching_base_number | date | active_vehicles | trips | |
|---|---|---|---|---|
| 0 | B02512 | 1/1/2015 | 190 | 1132 |
| 1 | B02765 | 1/1/2015 | 225 | 1765 |
| 2 | B02764 | 1/1/2015 | 3427 | 29421 |
In [490]:
!pip install chart_studio
!pip install plotly
Requirement already satisfied: chart_studio in /opt/anaconda3/lib/python3.12/site-packages (1.1.0) Requirement already satisfied: plotly in /opt/anaconda3/lib/python3.12/site-packages (from chart_studio) (5.22.0) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.12/site-packages (from chart_studio) (2.32.2) Requirement already satisfied: retrying>=1.3.3 in /opt/anaconda3/lib/python3.12/site-packages (from chart_studio) (1.3.4) Requirement already satisfied: six in /opt/anaconda3/lib/python3.12/site-packages (from chart_studio) (1.16.0) Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from plotly->chart_studio) (8.2.2) Requirement already satisfied: packaging in /opt/anaconda3/lib/python3.12/site-packages (from plotly->chart_studio) (23.2) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests->chart_studio) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests->chart_studio) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests->chart_studio) (2.2.2) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests->chart_studio) (2024.7.4) Requirement already satisfied: plotly in /opt/anaconda3/lib/python3.12/site-packages (5.22.0) Requirement already satisfied: tenacity>=6.2.0 in /opt/anaconda3/lib/python3.12/site-packages (from plotly) (8.2.2) Requirement already satisfied: packaging in /opt/anaconda3/lib/python3.12/site-packages (from plotly) (23.2)
In [491]:
import chart_studio.plotly as py
import plotly.graph_objs as go
import plotly.express as px
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
In [492]:
init_notebook_mode(connected=True)
In [493]:
uber_foil.columns
Out[493]:
Index(['dispatching_base_number', 'date', 'active_vehicles', 'trips'], dtype='object')
In [494]:
px.box(x='dispatching_base_number', y='active_vehicles',data_frame=uber_foil)
In [ ]:
In [495]:
print("Observation:\n\nThe box plot visualizes the distribution of active vehicles for each dispatching base, providing insights into the variability and central tendencies of the data across different bases. Key takeaways from this plot include:\n\nVariation in Vehicle Activity Across Bases:\n\nSome dispatching bases have a wider range of active vehicles, indicating greater variability in the number of cars deployed from those bases on any given day.\n\nConversely, some bases may have a more consistent number of active vehicles, as reflected by smaller interquartile ranges (IQR), which indicates a more stable or uniform operational scale.\n\nOutliers and Extreme Values:\n\nThe presence of outliers in certain bases shows that on some occasions, the number of active vehicles might have been significantly higher or lower than usual. These outliers could represent special events, unusual demand surges, or operational anomalies.\n\nIdentifying such outliers can be useful for exploring cases where operational capacity was exceeded or when fewer vehicles were deployed than expected.\n\nComparing Dispatch Bases:\n\nThe comparison across multiple dispatching bases reveals which hubs consistently manage a higher or lower number of active vehicles. Some bases might have a higher median number of active vehicles, suggesting that they are responsible for a larger share of the operations.\n\nThe distribution patterns across bases can help Uber optimize fleet management and ensure that the right number of vehicles are dispatched based on historical data.\n\nOperational Efficiency:\n\nBases with tightly clustered data points (small IQR) likely run more predictably and efficiently, while bases with wider distributions may face operational challenges, such as fluctuating demand or driver availability.\n\nConclusion:\n\nThis box plot helps highlight the distribution and variability in the number of active vehicles across different dispatching bases. By identifying outliers and understanding the spread of data, Uber can fine-tune its resource allocation, address any anomalies, and ensure smoother operations at each base.")
Observation: The box plot visualizes the distribution of active vehicles for each dispatching base, providing insights into the variability and central tendencies of the data across different bases. Key takeaways from this plot include: Variation in Vehicle Activity Across Bases: Some dispatching bases have a wider range of active vehicles, indicating greater variability in the number of cars deployed from those bases on any given day. Conversely, some bases may have a more consistent number of active vehicles, as reflected by smaller interquartile ranges (IQR), which indicates a more stable or uniform operational scale. Outliers and Extreme Values: The presence of outliers in certain bases shows that on some occasions, the number of active vehicles might have been significantly higher or lower than usual. These outliers could represent special events, unusual demand surges, or operational anomalies. Identifying such outliers can be useful for exploring cases where operational capacity was exceeded or when fewer vehicles were deployed than expected. Comparing Dispatch Bases: The comparison across multiple dispatching bases reveals which hubs consistently manage a higher or lower number of active vehicles. Some bases might have a higher median number of active vehicles, suggesting that they are responsible for a larger share of the operations. The distribution patterns across bases can help Uber optimize fleet management and ensure that the right number of vehicles are dispatched based on historical data. Operational Efficiency: Bases with tightly clustered data points (small IQR) likely run more predictably and efficiently, while bases with wider distributions may face operational challenges, such as fluctuating demand or driver availability. Conclusion: This box plot helps highlight the distribution and variability in the number of active vehicles across different dispatching bases. By identifying outliers and understanding the spread of data, Uber can fine-tune its resource allocation, address any anomalies, and ensure smoother operations at each base.
In [ ]:
In [ ]:
In [ ]:
In [496]:
px.violin(x='dispatching_base_number', y='active_vehicles',data_frame=uber_foil)
In [ ]:
In [497]:
print("Observation: A violin plot is generated to compare the distribution of active_vehicles across different dispatching_base_number categories. The plot combines aspects of a box plot and a density plot, helping to visualize both the spread of the data and the probability density of vehicle counts across different dispatching bases.")
Observation: A violin plot is generated to compare the distribution of active_vehicles across different dispatching_base_number categories. The plot combines aspects of a box plot and a density plot, helping to visualize both the spread of the data and the probability density of vehicle counts across different dispatching bases.
In [ ]:
In [ ]:
In [498]:
os.listdir(r"/Users/mani/Documents/Data analytics projects/Datasets/")
Out[498]:
['other-Lyft_B02510.csv', 'other-FHV-services_jan-aug-2015.csv', 'other-Firstclass_B01536.csv', 'other-Skyline_B00111.csv', 'uber-raw-data-janjune-15_sample.csv', 'uber-raw-data-janjune-15.csv', 'other-American_B01362.csv', 'uber-raw-data-apr14.csv', 'Uber-Jan-Feb-FOIL.csv', 'other-Highclass_B01717.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv', 'uber-raw-data-jul14.csv', 'other-Federal_02216.csv', 'uber-raw-data-jun14.csv', 'other-Carmel_B00256.csv', 'other-Diplo_B01196.csv', 'other-Dial7_B00887.csv', 'uber-raw-data-may14.csv', 'other-Prestige_B01338.csv']
In [499]:
files = os.listdir(r"/Users/mani/Documents/Data analytics projects/Datasets/")[7:19]
In [500]:
files
Out[500]:
['uber-raw-data-apr14.csv', 'Uber-Jan-Feb-FOIL.csv', 'other-Highclass_B01717.csv', 'uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv', 'uber-raw-data-jul14.csv', 'other-Federal_02216.csv', 'uber-raw-data-jun14.csv', 'other-Carmel_B00256.csv', 'other-Diplo_B01196.csv', 'other-Dial7_B00887.csv', 'uber-raw-data-may14.csv']
In [501]:
files.remove('uber-raw-data-apr14.csv')
In [502]:
files.remove('Uber-Jan-Feb-FOIL.csv')
In [503]:
files.remove('other-Highclass_B01717.csv')
In [504]:
files.remove('other-Federal_02216.csv')
In [505]:
files.remove('other-Carmel_B00256.csv')
In [506]:
files.remove('other-Diplo_B01196.csv')
In [507]:
files.remove('uber-raw-data-may14.csv')
In [508]:
files
Out[508]:
['uber-raw-data-aug14.csv', 'uber-raw-data-sep14.csv', 'uber-raw-data-jul14.csv', 'uber-raw-data-jun14.csv', 'other-Dial7_B00887.csv']
In [509]:
final = pd.DataFrame()
path = r"/Users/mani/Documents/Data analytics projects/Datasets/"
for file in files:
current_df = pd.read_csv(path+'/'+file)
final = pd.concat([current_df , final])
In [510]:
final.shape
Out[510]:
(3512368, 10)
In [511]:
final.duplicated().sum()
Out[511]:
67003
In [512]:
final.drop_duplicates(inplace=True)
In [513]:
final.shape
Out[513]:
(3445365, 10)
In [514]:
final.head(3)
Out[514]:
| Date | Time | State | PuFrom | Address | Street | Date/Time | Lat | Lon | Base | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014.07.06 | 14:30 | NY ... | MANHATTAN | 50 | MURRAY ST | NaN | NaN | NaN | NaN |
| 1 | 2014.07.04 | 7:15 | NY ... | MANHATTAN | 143 | AVENUE B | NaN | NaN | NaN | NaN |
| 2 | 2014.07.05 | 5:45 | NY ... | MANHATTAN | 125 | CHRISTOPHER ST | NaN | NaN | NaN | NaN |
In [ ]:
In [515]:
rush_uber = final.groupby(['Lat','Lon'], as_index=False).size()
In [516]:
rush_uber.head(3)
Out[516]:
| Lat | Lon | size | |
|---|---|---|---|
| 0 | 39.6569 | -74.2258 | 1 |
| 1 | 39.6686 | -74.1607 | 1 |
| 2 | 39.7214 | -74.2446 | 1 |
In [ ]:
In [517]:
!pip install folium
Requirement already satisfied: folium in /opt/anaconda3/lib/python3.12/site-packages (0.17.0) Requirement already satisfied: branca>=0.6.0 in /opt/anaconda3/lib/python3.12/site-packages (from folium) (0.7.2) Requirement already satisfied: jinja2>=2.9 in /opt/anaconda3/lib/python3.12/site-packages (from folium) (3.1.4) Requirement already satisfied: numpy in /opt/anaconda3/lib/python3.12/site-packages (from folium) (1.26.4) Requirement already satisfied: requests in /opt/anaconda3/lib/python3.12/site-packages (from folium) (2.32.2) Requirement already satisfied: xyzservices in /opt/anaconda3/lib/python3.12/site-packages (from folium) (2022.9.0) Requirement already satisfied: MarkupSafe>=2.0 in /opt/anaconda3/lib/python3.12/site-packages (from jinja2>=2.9->folium) (2.1.3) Requirement already satisfied: charset-normalizer<4,>=2 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2.0.4) Requirement already satisfied: idna<4,>=2.5 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (3.7) Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2.2.2) Requirement already satisfied: certifi>=2017.4.17 in /opt/anaconda3/lib/python3.12/site-packages (from requests->folium) (2024.7.4)
In [518]:
import folium
In [519]:
basemap = folium.Map()
In [520]:
basemap
Out[520]:
Make this Notebook Trusted to load map: File -> Trust Notebook
In [ ]:
In [521]:
from folium.plugins import HeatMap
In [522]:
HeatMap(rush_uber).add_to(basemap)
Out[522]:
<folium.plugins.heat_map.HeatMap at 0x312a23b60>
In [523]:
basemap
Out[523]:
Make this Notebook Trusted to load map: File -> Trust Notebook